import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
# reading the dataset and storing it in a dataframe
df = pd.read_csv('311_Service_Requests_from_2010_to_Present.csv', low_memory =False)
# checking the dataset
df.head()
| Unique Key | Created Date | Closed Date | Agency | Agency Name | Complaint Type | Descriptor | Location Type | Incident Zip | Incident Address | ... | Bridge Highway Name | Bridge Highway Direction | Road Ramp | Bridge Highway Segment | Garage Lot Name | Ferry Direction | Ferry Terminal Name | Latitude | Longitude | Location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 32310363 | 12/31/2015 11:59:45 PM | 01-01-16 0:55 | NYPD | New York City Police Department | Noise - Street/Sidewalk | Loud Music/Party | Street/Sidewalk | 10034.0 | 71 VERMILYEA AVENUE | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 40.865682 | -73.923501 | (40.86568153633767, -73.92350095571744) |
| 1 | 32309934 | 12/31/2015 11:59:44 PM | 01-01-16 1:26 | NYPD | New York City Police Department | Blocked Driveway | No Access | Street/Sidewalk | 11105.0 | 27-07 23 AVENUE | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 40.775945 | -73.915094 | (40.775945312321085, -73.91509393898605) |
| 2 | 32309159 | 12/31/2015 11:59:29 PM | 01-01-16 4:51 | NYPD | New York City Police Department | Blocked Driveway | No Access | Street/Sidewalk | 10458.0 | 2897 VALENTINE AVENUE | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 40.870325 | -73.888525 | (40.870324522111424, -73.88852464418646) |
| 3 | 32305098 | 12/31/2015 11:57:46 PM | 01-01-16 7:43 | NYPD | New York City Police Department | Illegal Parking | Commercial Overnight Parking | Street/Sidewalk | 10461.0 | 2940 BAISLEY AVENUE | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 40.835994 | -73.828379 | (40.83599404683083, -73.82837939584206) |
| 4 | 32306529 | 12/31/2015 11:56:58 PM | 01-01-16 3:24 | NYPD | New York City Police Department | Illegal Parking | Blocked Sidewalk | Street/Sidewalk | 11373.0 | 87-14 57 ROAD | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 40.733060 | -73.874170 | (40.733059618956815, -73.87416975810375) |
5 rows × 53 columns
# checking the sahpe of dataset
df.shape
(300698, 53)
df['Created Date'] = pd.to_datetime(df['Created Date']) # coverting datetime object to time delta
df['Closed Date'] = pd.to_datetime(df['Closed Date']) # coverting datetime object to time delta
df.head()
| Unique Key | Created Date | Closed Date | Agency | Agency Name | Complaint Type | Descriptor | Location Type | Incident Zip | Incident Address | ... | Bridge Highway Name | Bridge Highway Direction | Road Ramp | Bridge Highway Segment | Garage Lot Name | Ferry Direction | Ferry Terminal Name | Latitude | Longitude | Location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 32310363 | 2015-12-31 23:59:45 | 2016-01-01 00:55:00 | NYPD | New York City Police Department | Noise - Street/Sidewalk | Loud Music/Party | Street/Sidewalk | 10034.0 | 71 VERMILYEA AVENUE | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 40.865682 | -73.923501 | (40.86568153633767, -73.92350095571744) |
| 1 | 32309934 | 2015-12-31 23:59:44 | 2016-01-01 01:26:00 | NYPD | New York City Police Department | Blocked Driveway | No Access | Street/Sidewalk | 11105.0 | 27-07 23 AVENUE | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 40.775945 | -73.915094 | (40.775945312321085, -73.91509393898605) |
| 2 | 32309159 | 2015-12-31 23:59:29 | 2016-01-01 04:51:00 | NYPD | New York City Police Department | Blocked Driveway | No Access | Street/Sidewalk | 10458.0 | 2897 VALENTINE AVENUE | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 40.870325 | -73.888525 | (40.870324522111424, -73.88852464418646) |
| 3 | 32305098 | 2015-12-31 23:57:46 | 2016-01-01 07:43:00 | NYPD | New York City Police Department | Illegal Parking | Commercial Overnight Parking | Street/Sidewalk | 10461.0 | 2940 BAISLEY AVENUE | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 40.835994 | -73.828379 | (40.83599404683083, -73.82837939584206) |
| 4 | 32306529 | 2015-12-31 23:56:58 | 2016-01-01 03:24:00 | NYPD | New York City Police Department | Illegal Parking | Blocked Sidewalk | Street/Sidewalk | 11373.0 | 87-14 57 ROAD | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 40.733060 | -73.874170 | (40.733059618956815, -73.87416975810375) |
5 rows × 53 columns
df['Request_Closing_Time'] = df['Closed Date'] - df['Created Date'] # creating a column to store time taken to close the complain
df['Request_Closing_Time'] = df['Request_Closing_Time'].dt.seconds #converting time to seconds
df['Request_Closing_Time'] = df['Request_Closing_Time']/3600 # converting time to hours
df.head()
| Unique Key | Created Date | Closed Date | Agency | Agency Name | Complaint Type | Descriptor | Location Type | Incident Zip | Incident Address | ... | Bridge Highway Direction | Road Ramp | Bridge Highway Segment | Garage Lot Name | Ferry Direction | Ferry Terminal Name | Latitude | Longitude | Location | Request_Closing_Time | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 32310363 | 2015-12-31 23:59:45 | 2016-01-01 00:55:00 | NYPD | New York City Police Department | Noise - Street/Sidewalk | Loud Music/Party | Street/Sidewalk | 10034.0 | 71 VERMILYEA AVENUE | ... | NaN | NaN | NaN | NaN | NaN | NaN | 40.865682 | -73.923501 | (40.86568153633767, -73.92350095571744) | 0.920833 |
| 1 | 32309934 | 2015-12-31 23:59:44 | 2016-01-01 01:26:00 | NYPD | New York City Police Department | Blocked Driveway | No Access | Street/Sidewalk | 11105.0 | 27-07 23 AVENUE | ... | NaN | NaN | NaN | NaN | NaN | NaN | 40.775945 | -73.915094 | (40.775945312321085, -73.91509393898605) | 1.437778 |
| 2 | 32309159 | 2015-12-31 23:59:29 | 2016-01-01 04:51:00 | NYPD | New York City Police Department | Blocked Driveway | No Access | Street/Sidewalk | 10458.0 | 2897 VALENTINE AVENUE | ... | NaN | NaN | NaN | NaN | NaN | NaN | 40.870325 | -73.888525 | (40.870324522111424, -73.88852464418646) | 4.858611 |
| 3 | 32305098 | 2015-12-31 23:57:46 | 2016-01-01 07:43:00 | NYPD | New York City Police Department | Illegal Parking | Commercial Overnight Parking | Street/Sidewalk | 10461.0 | 2940 BAISLEY AVENUE | ... | NaN | NaN | NaN | NaN | NaN | NaN | 40.835994 | -73.828379 | (40.83599404683083, -73.82837939584206) | 7.753889 |
| 4 | 32306529 | 2015-12-31 23:56:58 | 2016-01-01 03:24:00 | NYPD | New York City Police Department | Illegal Parking | Blocked Sidewalk | Street/Sidewalk | 11373.0 | 87-14 57 ROAD | ... | NaN | NaN | NaN | NaN | NaN | NaN | 40.733060 | -73.874170 | (40.733059618956815, -73.87416975810375) | 3.450556 |
5 rows × 54 columns
df.columns # Checking
Index(['Unique Key', 'Created Date', 'Closed Date', 'Agency', 'Agency Name',
'Complaint Type', 'Descriptor', 'Location Type', 'Incident Zip',
'Incident Address', 'Street Name', 'Cross Street 1', 'Cross Street 2',
'Intersection Street 1', 'Intersection Street 2', 'Address Type',
'City', 'Landmark', 'Facility Type', 'Status', 'Due Date',
'Resolution Description', 'Resolution Action Updated Date',
'Community Board', 'Borough', 'X Coordinate (State Plane)',
'Y Coordinate (State Plane)', 'Park Facility Name', 'Park Borough',
'School Name', 'School Number', 'School Region', 'School Code',
'School Phone Number', 'School Address', 'School City', 'School State',
'School Zip', 'School Not Found', 'School or Citywide Complaint',
'Vehicle Type', 'Taxi Company Borough', 'Taxi Pick Up Location',
'Bridge Highway Name', 'Bridge Highway Direction', 'Road Ramp',
'Bridge Highway Segment', 'Garage Lot Name', 'Ferry Direction',
'Ferry Terminal Name', 'Latitude', 'Longitude', 'Location',
'Request_Closing_Time'],
dtype='object')
Complaints = df[['City','Complaint Type','Location Type','Status','Borough','Latitude','Longitude', 'Request_Closing_Time']]
Complaints.head()
| City | Complaint Type | Location Type | Status | Borough | Latitude | Longitude | Request_Closing_Time | |
|---|---|---|---|---|---|---|---|---|
| 0 | NEW YORK | Noise - Street/Sidewalk | Street/Sidewalk | Closed | MANHATTAN | 40.865682 | -73.923501 | 0.920833 |
| 1 | ASTORIA | Blocked Driveway | Street/Sidewalk | Closed | QUEENS | 40.775945 | -73.915094 | 1.437778 |
| 2 | BRONX | Blocked Driveway | Street/Sidewalk | Closed | BRONX | 40.870325 | -73.888525 | 4.858611 |
| 3 | BRONX | Illegal Parking | Street/Sidewalk | Closed | BRONX | 40.835994 | -73.828379 | 7.753889 |
| 4 | ELMHURST | Illegal Parking | Street/Sidewalk | Closed | QUEENS | 40.733060 | -73.874170 | 3.450556 |
status = Complaints['Status'].value_counts() # getting complaint status
#Top 10 Complaint Types
groupby_complaint_type = Complaints['Complaint Type'].value_counts().nlargest(10)
groupby_complaint_type # getting highest complaints
Blocked Driveway 77044 Illegal Parking 75361 Noise - Street/Sidewalk 48612 Noise - Commercial 35577 Derelict Vehicle 17718 Noise - Vehicle 17083 Animal Abuse 7778 Traffic 4498 Homeless Encampment 4416 Noise - Park 4042 Name: Complaint Type, dtype: int64
# Graph representing complaints under each category
Complaints['Complaint Type'].value_counts().plot(kind ='bar', alpha=0.6 ,figsize =(15,10))
plt.title("No. of complaints recieved under each category", fontsize = 20, color='red',pad ='3.0')
plt.xlabel('Complaint Types ->', fontsize = 15)
plt.ylabel('No. of compliants ->' , fontsize = 15)
plt.show()
Top five complaints registrerd are:
#Top 10 cities with complaints
groupby_city = Complaints['City'].value_counts().nlargest(10)
groupby_city #getting city with highest complaints
BROOKLYN 98307 NEW YORK 65994 BRONX 40702 STATEN ISLAND 12343 JAMAICA 7296 ASTORIA 6330 FLUSHING 5971 RIDGEWOOD 5163 CORONA 4295 WOODSIDE 3544 Name: City, dtype: int64
# Graph representing cities and complaints registered
Complaints['City'].value_counts().plot(kind ='bar', alpha=0.6 ,figsize =(16,10),color='red')
plt.title("No. of complaints recived for each city", fontsize = 20, color='red',pad ='3.0')
plt.xlabel('City', fontsize = 15)
plt.ylabel('No. of compliants' , fontsize = 15)
plt.show()
Top 5 cities as per the number of complaints recieved:
Top_city = Complaints[Complaints['City']== 'BROOKLYN']
Blocked_Driveway = Top_city[Top_city['Complaint Type'] == 'Blocked Driveway']
Blocked_Driveway.head()
| City | Complaint Type | Location Type | Status | Borough | Latitude | Longitude | Request_Closing_Time | |
|---|---|---|---|---|---|---|---|---|
| 9 | BROOKLYN | Blocked Driveway | Street/Sidewalk | Closed | BROOKLYN | 40.623793 | -73.999539 | 1.383889 |
| 36 | BROOKLYN | Blocked Driveway | Street/Sidewalk | Closed | BROOKLYN | 40.670843 | -73.935556 | 0.312500 |
| 102 | BROOKLYN | Blocked Driveway | Street/Sidewalk | Closed | BROOKLYN | 40.689303 | -73.943395 | 0.546667 |
| 110 | BROOKLYN | Blocked Driveway | Street/Sidewalk | Closed | BROOKLYN | 40.638963 | -73.911903 | 4.416667 |
| 117 | BROOKLYN | Blocked Driveway | Street/Sidewalk | Closed | BROOKLYN | 40.630849 | -73.901790 | 4.578889 |
Blocked_Driveway['Complaint Type'].value_counts()
Blocked Driveway 28148 Name: Complaint Type, dtype: int64
Preparing a cluster map for the Blocked Driveway in Brooklyn
import folium
import math
from folium.plugins import MarkerCluster
import plotly.graph_objects as go
import plotly.express as px
import datetime
from plotly.subplots import make_subplots
# Base Map
Blyn = (40.6782,-73.9442)
m= folium.Map(location = Blyn, tile = 'stamen terrain', zoom_start =12)
# Cluster Map of Blocked Driveway compliants in Brooklyn
mc = MarkerCluster()
for idx,row in Blocked_Driveway.iterrows():
if not math.isnan(row['Longitude']) and not math.isnan(row['Latitude']):
mc.add_child(folium.Marker([row['Latitude'],row['Longitude']]))
m.add_child(mc)
Test_df = Complaints[['Complaint Type','Request_Closing_Time']]
Test_df.head()
| Complaint Type | Request_Closing_Time | |
|---|---|---|
| 0 | Noise - Street/Sidewalk | 0.920833 |
| 1 | Blocked Driveway | 1.437778 |
| 2 | Blocked Driveway | 4.858611 |
| 3 | Illegal Parking | 7.753889 |
| 4 | Illegal Parking | 3.450556 |
Test_df.isnull().sum() #checking null or NaN values in the dataframe
Complaint Type 0 Request_Closing_Time 2164 dtype: int64
# Test_final = Test_df.dropna() # dropping the rows with NaN values in Request_Closing_Time
# Test_final.isnull().sum() # Checking if NaN values are completely removed
Complaint Type 0 Request_Closing_Time 0 dtype: int64
# Average time to address each complaint types
Test_df1 = Test_df.groupby(by= 'Complaint Type').mean()
Test_df1.head()
| Request_Closing_Time | |
|---|---|
| Complaint Type | |
| Agency Issues | 5.260324 |
| Animal Abuse | 4.626220 |
| Animal in a Park | 0.834722 |
| Bike/Roller/Skate Chronic | 3.540047 |
| Blocked Driveway | 4.384702 |
Test_df1.sort_values(by='Request_Closing_Time', ascending = False)
Test_df1.head()
| Request_Closing_Time | |
|---|---|
| Complaint Type | |
| Agency Issues | 5.260324 |
| Animal Abuse | 4.626220 |
| Animal in a Park | 0.834722 |
| Bike/Roller/Skate Chronic | 3.540047 |
| Blocked Driveway | 4.384702 |
# Graph representing cities and complaints registered
Test_df1.sort_values(by='Request_Closing_Time').plot(kind ='bar', alpha=0.6 ,figsize =(16,12),color='green')
plt.title("Average closing time across closing time", fontsize = 20, color='red',pad ='3.0')
plt.xlabel('Complaint Type', fontsize = 15)
plt.ylabel('Request_Closing_Time' , fontsize = 15)
plt.show()
#Top 10 comaplints with largest average closing time
groupby_closing_time = Test_df1['Request_Closing_Time'].nlargest(5)
Avg_closing_time = pd.DataFrame(groupby_closing_time)
Avg_closing_time
| Request_Closing_Time | |
|---|---|
| Complaint Type | |
| Graffiti | 5.664526 |
| Derelict Vehicle | 5.588838 |
| Agency Issues | 5.260324 |
| Animal Abuse | 4.626220 |
| Blocked Driveway | 4.384702 |
#Top 10 comaplints with less average closing time
groupby_closing_time = Test_df1['Request_Closing_Time'].nsmallest(5)
Avg_closing_time = pd.DataFrame(groupby_closing_time)
Avg_closing_time
| Request_Closing_Time | |
|---|---|
| Complaint Type | |
| Animal in a Park | 0.834722 |
| Posting Advertisement | 1.938766 |
| Illegal Fireworks | 2.618282 |
| Noise - House of Worship | 2.960790 |
| Noise - Commercial | 2.991226 |
Borough_df =Complaints[['Borough','Complaint Type']]
Borough_df.head()
| Borough | Complaint Type | |
|---|---|---|
| 0 | MANHATTAN | Noise - Street/Sidewalk |
| 1 | QUEENS | Blocked Driveway |
| 2 | BRONX | Blocked Driveway |
| 3 | BRONX | Illegal Parking |
| 4 | QUEENS | Illegal Parking |
# Graph representing Borough and complaints registered
Borough_df['Borough'].value_counts().plot(kind ='bar', alpha=0.6 ,figsize =(16,10),color='indigo')
plt.title("No. of complaints recived for each Borough", fontsize = 20, color='red',pad ='3.0')
plt.xlabel('Borough', fontsize = 15)
plt.ylabel('No. of compliants' , fontsize = 15)
plt.show()
# Order of the complaint types based on the average ‘Request_Closing_Time’, grouping them for different locations.
df3 = Complaints[['City','Complaint Type','Request_Closing_Time']]
df4 = df3.groupby(by=['City','Complaint Type']).mean()
df4.sort_values(by=['City','Request_Closing_Time','Complaint Type'])
| Request_Closing_Time | ||
|---|---|---|
| City | Complaint Type | |
| ARVERNE | Drinking | 0.238611 |
| Vending | 0.483333 | |
| Urinating in Public | 0.691944 | |
| Panhandling | 1.033333 | |
| Noise - Park | 1.283333 | |
| ... | ... | ... |
| Woodside | Noise - Commercial | 2.394167 |
| Noise - Street/Sidewalk | 3.411278 | |
| Derelict Vehicle | 4.966667 | |
| Illegal Parking | 4.979203 | |
| Blocked Driveway | 6.406212 |
764 rows × 1 columns
# Importing requires libraries for hypothesis testing
import pingouin as pg
from scipy.stats import chi2_contingency,chi2
D:\Anaconda3\lib\site-packages\outdated\utils.py:14: OutdatedPackageWarning: The package pingouin is out of date. Your version is 0.3.9, the latest is 0.3.10. Set the environment variable OUTDATED_IGNORE=1 to disable these warnings. return warn(
Complaint_Response_Time = Complaints[['Complaint Type','Request_Closing_Time']]
aov = pg.anova(dv = 'Request_Closing_Time', between ='Complaint Type', data = Complaint_Response_Time, detailed = True)
aov
| Source | SS | DF | MS | F | p-unc | np2 | |
|---|---|---|---|---|---|---|---|
| 0 | Complaint Type | 1.407689e+05 | 22 | 6398.586790 | 443.372575 | 0.0 | 0.031642 |
| 1 | Within | 4.307999e+06 | 298511 | 14.431625 | NaN | NaN | NaN |
# creating a new dataframe with Complaint Type and Location Type
group = Complaints[['Location Type','Complaint Type']]
group_count = group.count()
# checking NaN values
chi_test = group.fillna(value=0)
chi_test.head()
| Location Type | Complaint Type | |
|---|---|---|
| 0 | Street/Sidewalk | Noise - Street/Sidewalk |
| 1 | Street/Sidewalk | Blocked Driveway |
| 2 | Street/Sidewalk | Blocked Driveway |
| 3 | Street/Sidewalk | Illegal Parking |
| 4 | Street/Sidewalk | Illegal Parking |
# checking size of dataset
chi_test.shape
(300698, 2)
# # dropping NaN values
# Chi_test = df_6.dropna()
# Chi_test.head()
# selecting field to test for the association
myField1 = chi_test['Location Type']
myField2 = chi_test['Complaint Type']
#To get a quick look at the counts from these we can use Pandas 'crosstab'
myCrosstable = pd.crosstab(myField1,myField2)
myCrosstable.head()
| Complaint Type | Agency Issues | Animal Abuse | Animal in a Park | Bike/Roller/Skate Chronic | Blocked Driveway | Derelict Vehicle | Disorderly Youth | Drinking | Ferry Complaint | Graffiti | ... | Noise - House of Worship | Noise - Park | Noise - Street/Sidewalk | Noise - Vehicle | Panhandling | Posting Advertisement | Squeegee | Traffic | Urinating in Public | Vending |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Location Type | |||||||||||||||||||||
| 0 | 6 | 3 | 0 | 0 | 37 | 8 | 0 | 1 | 0 | 0 | ... | 2 | 1 | 11 | 3 | 0 | 1 | 0 | 5 | 1 | 2 |
| Bridge | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| Club/Bar/Restaurant | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 366 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 21 | 0 |
| Commercial | 0 | 62 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| Ferry | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
5 rows × 24 columns
myCrosstable.shape
(19, 24)
chi, pval, dof, exp = chi2_contingency(myCrosstable)
print('p-value is: ', pval)
significance = 0.05
p = 1 - significance
critical_value = chi2.ppf(p, dof)
print('chi=%.6f, critical value=%.6f\n' % (chi, critical_value))
if chi > critical_value:
print("""At %.2f level of significance, we reject the null hypotheses and accept H1.They are related.""" % (significance))
else:
print("""At %.2f level of significance, we accept the null hypotheses. They aren't related.""" % (significance))
p-value is: 0.0 chi=1652072.540954, critical value=462.440110 At 0.05 level of significance, we reject the null hypotheses and accept H1.They are related.
from scipy import stats
alpha = stats.chi2.cdf(1652072.540954,414)
alpha
1.0